Liu dataset GSE59288

Note: Human part of the dataset consists of autistic samples only. Control samples were taken from GSE51264

Load data

Metadata downloaded using getGEO, expression data downloaded manually from the Supplementary file GSE59288_exp_mRNA.txt.gz and gene information extracted using biomaRt

# LOAD METADATA
if(!file.exists('./../Data/Voineagu/GSE59288_exp_mRNA.txt.gz')){
  GSE59288 = getGEO('GSE59288', destdir='./../Data/Liu/')  
} else {
  GSE59288 = getGEO(filename='./../Data/Liu/GSE59288_exp_mRNA.txt.gz')
}
# GEO entry has two datasets, keeping the human one
GPL11154 = GSE59288$`GSE59288-GPL11154_series_matrix.txt.gz`

datMeta = pData(GPL11154)
datMeta$ID = rownames(datMeta)
datMeta$age = round(as.numeric(gsub(' days', '', datMeta$`age:ch1`))/365)
datMeta$age_group = cut(datMeta$age, c(-0.5, 0, 0.6, 10, 20, 30, 40, 50, 60, 70, 80),
                    labels=c('Fetal','Infant','Child','10-20','20s','30s','40s','50s','60s','70s'))


# LOAD EXPRESSION DATA
gz_file = gzfile('./../Data/Liu/GSE59288_exp_mRNA.txt.gz','rt')
datExpr = read.delim(gz_file) %>% dplyr::select(aut1:aut34)
colnames(datExpr) = rownames(datMeta)


# ANNOTATE GENES
getinfo = c('ensembl_gene_id','external_gene_id','chromosome_name','start_position',
            'end_position','strand','band','gene_biotype','percentage_gc_content')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
               dataset='hsapiens_gene_ensembl',
               host='feb2014.archive.ensembl.org') ## Gencode v19
datGenes = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(datExpr), mart=mart)
datGenes = datGenes[match(rownames(datExpr), datGenes$ensembl_gene_id),]
datGenes$length = datGenes$end_position-datGenes$start_position


# SFARI Genes
SFARI_genes = read_csv('./../Data/SFARI/SFARI_genes_with_ensembl_IDs.csv')


# GO Annotations
GO_annotations = read.csv('./../Data/GO_annotations/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID' = as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal' = 1)


# Combine SFARI and GO information
gene_info = data.frame('ID'=rownames(datExpr)) %>% left_join(SFARI_genes, by='ID') %>% 
            mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
            left_join(GO_neuronal, by='ID') %>% 
            mutate('gene.score'=ifelse(`gene-score`=='None' & Neuronal==1, 'Neuronal', `gene-score`)) %>%
            mutate('gene.score'=ifelse(is.na(gene.score), 'None', gene.score))


rm(GSE59288, GPL11154, gz_file, getinfo, mart)

Check sample distribution

RNA-Seq for 34 brain-tissue samples from the superior frontal gyrus, all belonging to subjects with ASD

print(paste0('Dataset includes ', nrow(datExpr), ' genes from ', ncol(datExpr), ' samples, each belonging to a different subject.'))
## [1] "Dataset includes 12557 genes from 34 samples, each belonging to a different subject."


Sex distribution: There are twice as many males than females

table(datMeta$`gender:ch1`)
## 
## female   male 
##     10     24

Age distribution: Subjects between 2 and 60 years old with a mean close to 21

summary(datMeta$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00    9.00   18.50   21.24   29.75   60.00


Filtering


1. Filter genes with start or end position missing

to_keep = !is.na(datGenes$length)
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]
rownames(datGenes) = datGenes$ensembl_gene_id

print(paste0('Removed ', sum(!to_keep), 'gene, ', sum(to_keep), ' remaining'))
## [1] "Removed 35gene, 12522 remaining"


2. Filter genes with low expression levels

The density distribution doesn’t seem to suggest any clearly defined threshold. Selecting 1.2, which is the point where the slope changes at the beginning of the distribution

plot_data = data.frame('id'=rownames(datExpr), 'mean_expression' = rowMeans(datExpr))
ggplotly(plot_data %>% ggplot(aes(x=mean_expression)) + geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) + 
              geom_vline(xintercept=1.2, color='gray') + scale_x_log10() + 
              ggtitle('gene Mean Expression distribution') + theme_minimal())
to_keep = rowMeans(datExpr)>1.2
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]

print(paste0('Removed ', sum(!to_keep), ' gene, ', sum(to_keep), ' remaining'))
## [1] "Removed 127 gene, 12395 remaining"


3. Filter outlier samples

Using node connectivity as a distance measure, normalising it and filtering out genes farther away than 2 standard deviations from the left (lower connectivity than average, not higher)

  • Using \(s_{ij}=|bw(i,j)|\) to define connectivity between genes.

  • Filtering a single data point

absadj = datExpr %>% bicor %>% abs
netsummary = fundamentalNetworkConcepts(absadj)
ku = netsummary$Connectivity
z.ku = (ku-mean(ku))/sqrt(var(ku))

plot_data = data.frame('ID'=rownames(datMeta), 'sample'=1:length(z.ku), 'distance'=z.ku, 'Age'=datMeta$age_group)
ggplotly(plot_data %>% ggplot(aes(sample, distance, color=Age)) + geom_point() + geom_hline(yintercept=-2, color='gray') + theme_minimal())
print(paste0('Outlier samples: ', paste(as.character(plot_data$ID[plot_data$distance< -2]), collapse=', ')))
## [1] "Outlier samples: GSM1432827"
to_keep = abs(z.ku)<2
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]

print(paste0('Removed ', sum(!to_keep), ' sample, ', sum(to_keep), ' remaining'))
## [1] "Removed 1 sample, 33 remaining"
rm(absadj, netsummary, ku, z.ku, plot_data, to_keep)
print(paste0('After filtering, the dataset consists of ', nrow(datExpr), ' genes and ', ncol(datExpr), ' samples'))
## [1] "After filtering, the dataset consists of 12395 genes and 33 samples"



Mean and SD by SFARI score

No clear relation between SFARI score and mean expression/SD. This dataset doesn’t seem to have the same problem as Gandal’s

plot_data = data.frame('ID'=rownames(datExpr), 'MeanExpr'=rowMeans(datExpr), 'SDExpr'=apply(datExpr,1,sd)) %>% 
            left_join(gene_info, by='ID')

p1 = ggplotly(plot_data %>% ggplot(aes(gene.score, MeanExpr, fill=gene.score)) + geom_boxplot() + 
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
              theme(legend.position='none'))

p2 = ggplotly(plot_data %>% ggplot(aes(gene.score, SDExpr, fill=gene.score)) + geom_boxplot() + 
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
              ggtitle('Mean Expression (left) and SD (right) by SFARI score') + 
              theme(legend.position='none'))

subplot(p1, p2, nrows=1)
rm(plot_data, p1, p2)



Normalisation

plot_data = data.frame('ID'=rownames(datExpr), 'Mean'=rowMeans(datExpr), 'SD'=apply(datExpr,1,sd))

plot_data %>% ggplot(aes(Mean, SD)) + geom_point(color='#0099cc', alpha=0.1) + geom_abline(color='gray') +
              scale_x_log10() + scale_y_log10() + theme_minimal()

rm(plot_data)

We cannot use DESeq2 normalisation because the entries are not counts, but RPKM, so using limma instead

dge = DGEList(counts=datExpr, samples=datMeta, genes=datGenes)

# Calculate Normalisation factors
dge = calcNormFactors(dge)

# Perform Normaliation
if(max(dge$samples$lib.size)/min(dge$samples$lib.size)>3){
    print('Should use voom instead of cpm because the library size ratios are too big')
}
logCPM = cpm(dge, log=TRUE)

# Extract elements
datExpr = logCPM %>% data.frame
datMeta = dge$samples
datGenes = dge$genes

Data seems to be homoscedastic

plot_data = data.frame('ID'=rownames(datExpr), 'Mean'=rowMeans(datExpr), 'SD'=apply(datExpr,1,sd))

p1 = meanSdPlot(as.matrix(datExpr), plot=FALSE)$gg + theme_minimal()
p2 = plot_data %>% ggplot(aes(Mean, SD)) + geom_point(color='#0099cc', alpha=0.1) + 
  geom_smooth(method='lm', color='gray', size=0.5) + theme_minimal()

grid.arrange(p1, p2, ncol=2)

rm(plot_data, p1, p2)



Batch Correction

No Batch information available in the metadata!

Visualisations


Samples

PCA: There doesn’t seem to be a specific pattern related to gender or age

*There weren’t any patterns in MDS and t-SNE either (MDS was the same as PCA)

pca = datExpr %>% t %>% prcomp

plot_data = data.frame('ID'=colnames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>% 
            left_join(datMeta, by='ID') %>% dplyr::select('PC1','PC2','age','age_group','gender.ch1')

p1 = plot_data %>% ggplot(aes(PC1, PC2, color=gender.ch1)) + geom_point() + theme_minimal() + ggtitle('PCA') +
              xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
              ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)'))

p2 = plot_data %>% ggplot(aes(PC1, PC2, color=age)) + geom_point() + theme_minimal() + ggtitle('PCA') +
              xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) + scale_color_viridis() +
              ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)'))

grid.arrange(p1, p2, nrow=1)

rm(pca, plot_data)


Genes

  • First Principal Component explains 86% of the total variance (less than in Gandal and BrainSpan)

  • There’s a really strong correlation between the mean expression of a gene and the 1st principal component

pca = datExpr %>% prcomp

plot_data = data.frame( 'PC1' = pca$x[,1], 'PC2' = pca$x[,2], 'MeanExpr'=rowMeans(datExpr))

plot_data %>% ggplot(aes(PC1, PC2, color=MeanExpr)) + geom_point(alpha=0.5) + theme_minimal() + 
              scale_color_viridis() + ggtitle('PCA') +
              xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
              ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)'))

rm(pca, plot_data)



Mean and SD by SFARI score

No clear relation between SFARI score and mean expression/SD. This dataset doesn’t seem to have the same problem as Gandal’s

plot_data = data.frame('ID'=rownames(datExpr), 'MeanExpr'=rowMeans(datExpr), 'SDExpr'=apply(datExpr,1,sd)) %>% 
            left_join(gene_info, by='ID')

p1 = ggplotly(plot_data %>% ggplot(aes(gene.score, MeanExpr, fill=gene.score)) + geom_boxplot() + 
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
              theme(legend.position='none'))

p2 = ggplotly(plot_data %>% ggplot(aes(gene.score, SDExpr, fill=gene.score)) + geom_boxplot() + 
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
              ggtitle('Mean Expression (left) and SD (right) by SFARI score') + 
              theme(legend.position='none'))

subplot(p1, p2, nrows=1)
rm(plot_data, p1, p2)




Save preprocessed dataset

save(datExpr, datMeta, datGenes, file='./../Data/Liu/preprocessed_data.RData')



Session info

sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: Scientific Linux 7.6 (Nitrogen)
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] hexbin_1.27.2          Rtsne_0.15             vsn_3.50.0            
##  [4] edgeR_3.24.3           limma_3.38.3           WGCNA_1.66            
##  [7] fastcluster_1.1.25     dynamicTreeCut_1.63-1  forcats_0.3.0         
## [10] stringr_1.4.0          dplyr_0.8.0.1          purrr_0.3.1           
## [13] readr_1.3.1            tidyr_0.8.3            tibble_2.1.1          
## [16] tidyverse_1.2.1        viridis_0.5.1          viridisLite_0.3.0     
## [19] gridExtra_2.3          plotlyutils_0.0.0.9000 plotly_4.8.0          
## [22] ggplot2_3.1.0          biomaRt_2.38.0         GEOquery_2.50.5       
## [25] Biobase_2.42.0         BiocGenerics_0.28.0   
## 
## loaded via a namespace (and not attached):
##   [1] colorspace_1.4-1      htmlTable_1.11.2      base64enc_0.1-3      
##   [4] rstudioapi_0.7        affyio_1.52.0         bit64_0.9-7          
##   [7] AnnotationDbi_1.44.0  mvtnorm_1.0-7         lubridate_1.7.4      
##  [10] xml2_1.2.0            codetools_0.2-15      splines_3.5.2        
##  [13] doParallel_1.0.14     impute_1.56.0         robustbase_0.93-0    
##  [16] knitr_1.22            Formula_1.2-3         jsonlite_1.6         
##  [19] Cairo_1.5-9           broom_0.5.1           cluster_2.0.7-1      
##  [22] GO.db_3.7.0           shiny_1.2.0           BiocManager_1.30.4   
##  [25] rrcov_1.4-3           compiler_3.5.2        httr_1.4.0           
##  [28] backports_1.1.2       assertthat_0.2.1      Matrix_1.2-15        
##  [31] lazyeval_0.2.2        cli_1.1.0             later_0.8.0          
##  [34] acepack_1.4.1         htmltools_0.3.6       prettyunits_1.0.2    
##  [37] tools_3.5.2           affy_1.60.0           gtable_0.2.0         
##  [40] glue_1.3.1            Rcpp_1.0.1            cellranger_1.1.0     
##  [43] preprocessCore_1.44.0 nlme_3.1-137          crosstalk_1.0.0      
##  [46] iterators_1.0.9       xfun_0.5              rvest_0.3.2          
##  [49] mime_0.6              XML_3.98-1.11         DEoptimR_1.0-8       
##  [52] zlibbioc_1.28.0       MASS_7.3-51.1         scales_1.0.0         
##  [55] promises_1.0.1        hms_0.4.2             RColorBrewer_1.1-2   
##  [58] curl_3.3              yaml_2.2.0            memoise_1.1.0        
##  [61] rpart_4.1-13          latticeExtra_0.6-28   stringi_1.4.3        
##  [64] RSQLite_2.1.1         S4Vectors_0.20.1      pcaPP_1.9-73         
##  [67] foreach_1.4.4         checkmate_1.8.5       rlang_0.3.2          
##  [70] pkgconfig_2.0.2       matrixStats_0.54.0    bitops_1.0-6         
##  [73] evaluate_0.13         lattice_0.20-38       labeling_0.3         
##  [76] htmlwidgets_1.3       bit_1.1-14            tidyselect_0.2.5     
##  [79] robust_0.4-18         plyr_1.8.4            magrittr_1.5         
##  [82] R6_2.4.0              IRanges_2.16.0        generics_0.0.2       
##  [85] Hmisc_4.1-1           fit.models_0.5-14     DBI_1.0.0            
##  [88] pillar_1.3.1          haven_1.1.1           foreign_0.8-71       
##  [91] withr_2.1.2           survival_2.43-3       RCurl_1.95-4.10      
##  [94] nnet_7.3-12           modelr_0.1.4          crayon_1.3.4         
##  [97] rmarkdown_1.12        progress_1.2.0        locfit_1.5-9.1       
## [100] grid_3.5.2            readxl_1.1.0          data.table_1.12.0    
## [103] blob_1.1.1            digest_0.6.18         xtable_1.8-3         
## [106] httpuv_1.5.0          stats4_3.5.2          munsell_0.5.0